1 Executive Summary

2 Background: Airbnb in Beijing

3 Exploratory Data Analysis

First we have to download the data.

# Read the compressed listings file, then standardise its column names
# with clean_names() so later code can refer to them consistently.
data <- clean_names(vroom::vroom("listings.csv.gz"))

3.1 Raw Data Exploration

# Quick structural overview: row/column counts plus the type and first values
# of every column.
data %>%
  glimpse()
## Rows: 36,283
## Columns: 106
## $ id                                           <dbl> 44054, 100213, 114384, 1…
## $ listing_url                                  <chr> "https://www.airbnb.com/…
## $ scrape_id                                    <dbl> 2.02e+13, 2.02e+13, 2.02…
## $ last_scraped                                 <date> 2020-06-20, 2020-06-20,…
## $ name                                         <chr> "Modern and Comfortable …
## $ summary                                      <chr> "East Apartments offers …
## $ space                                        <chr> "East Apartments is a we…
## $ description                                  <chr> "East Apartments offers …
## $ experiences_offered                          <chr> "none", "none", "none", …
## $ neighborhood_overview                        <chr> "The neighborhood is a p…
## $ notes                                        <chr> "*For long term reservat…
## $ transit                                      <chr> "The easiest method to g…
## $ access                                       <chr> "*Guests have access to …
## $ interaction                                  <chr> NA, NA, "Helen和Wendy会全程为…
## $ house_rules                                  <chr> "Registration All guests…
## $ thumbnail_url                                <lgl> NA, NA, NA, NA, NA, NA, …
## $ medium_url                                   <lgl> NA, NA, NA, NA, NA, NA, …
## $ picture_url                                  <chr> "https://a0.muscache.com…
## $ xl_picture_url                               <lgl> NA, NA, NA, NA, NA, NA, …
## $ host_id                                      <dbl> 192875, 527062, 533062, …
## $ host_url                                     <chr> "https://www.airbnb.com/…
## $ host_name                                    <chr> "East Apartments", "Joe"…
## $ host_since                                   <date> 2010-08-06, 2011-04-22,…
## $ host_location                                <chr> "Beijing, Beijing, China…
## $ host_about                                   <chr> "Hi everyone!  My name i…
## $ host_response_time                           <chr> "within an hour", "N/A",…
## $ host_response_rate                           <chr> "100%", "N/A", "100%", "…
## $ host_acceptance_rate                         <chr> "95%", "N/A", "100%", "1…
## $ host_is_superhost                            <lgl> FALSE, FALSE, FALSE, FAL…
## $ host_thumbnail_url                           <chr> "https://a0.muscache.com…
## $ host_picture_url                             <chr> "https://a0.muscache.com…
## $ host_neighbourhood                           <chr> "Shuangjing", NA, "ITC",…
## $ host_listings_count                          <dbl> 5, 4, 5, 5, 1, 7, 7, 6, …
## $ host_total_listings_count                    <dbl> 5, 4, 5, 5, 1, 7, 7, 6, …
## $ host_verifications                           <chr> "['email', 'phone', 'fac…
## $ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ host_identity_verified                       <lgl> FALSE, FALSE, FALSE, FAL…
## $ street                                       <chr> "Beijing, Beijing, China…
## $ neighbourhood                                <chr> "Chaoyang", NA, "ITC", "…
## $ neighbourhood_cleansed                       <chr> "朝阳区 / Chaoyang", "密云县 /…
## $ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA, …
## $ city                                         <chr> "Beijing", "Beijing", "B…
## $ state                                        <chr> "Beijing", "Beijing", "B…
## $ zipcode                                      <dbl> 100022, 101508, NA, 1000…
## $ market                                       <chr> "Beijing", "Other (Inter…
## $ smart_location                               <chr> "Beijing, China", "Beiji…
## $ country_code                                 <chr> "CN", "CN", "CN", "CN", …
## $ country                                      <chr> "China", "China", "China…
## $ latitude                                     <dbl> 39.9, 40.7, 39.9, 39.9, …
## $ longitude                                    <dbl> 116, 117, 116, 116, 116,…
## $ is_location_exact                            <lgl> TRUE, TRUE, TRUE, FALSE,…
## $ property_type                                <chr> "Serviced apartment", "G…
## $ room_type                                    <chr> "Entire home/apt", "Priv…
## $ accommodates                                 <dbl> 9, 2, 2, 2, 3, 2, 4, 2, …
## $ bathrooms                                    <dbl> 2, 1, 1, 1, 1, 1, 1, 1, …
## $ bedrooms                                     <dbl> 3, 1, 1, 1, 1, 1, 1, 1, …
## $ beds                                         <dbl> 4, 1, 1, 1, 2, 1, 2, 1, …
## $ bed_type                                     <chr> "Real Bed", "Real Bed", …
## $ amenities                                    <chr> "{TV,\"Cable TV\",Intern…
## $ square_feet                                  <dbl> 1464, NA, NA, NA, 323, N…
## $ price                                        <chr> "$835.00", "$1,203.00", …
## $ weekly_price                                 <chr> "$8,373.00", "$7,200.00"…
## $ monthly_price                                <chr> "$27,603.00", "$28,800.0…
## $ security_deposit                             <chr> "$708.00", "$0.00", NA, …
## $ cleaning_fee                                 <chr> "$71.00", "$0.00", NA, "…
## $ guests_included                              <dbl> 6, 1, 1, 1, 2, 1, 1, 2, …
## $ extra_people                                 <chr> "$71.00", "$0.00", "$0.0…
## $ minimum_nights                               <dbl> 2, 1, 1, 1, 3, 1, 1, 1, …
## $ maximum_nights                               <dbl> 365, 30, 730, 1125, 365,…
## $ minimum_minimum_nights                       <dbl> 2, 1, 1, 1, 3, 1, 1, 1, …
## $ maximum_minimum_nights                       <dbl> 2, 1, 1, 1, 3, 1, 1, 1, …
## $ minimum_maximum_nights                       <dbl> 365, 30, 730, 1125, 365,…
## $ maximum_maximum_nights                       <dbl> 365, 30, 730, 1125, 365,…
## $ minimum_nights_avg_ntm                       <dbl> 2, 1, 1, 1, 3, 1, 1, 1, …
## $ maximum_nights_avg_ntm                       <dbl> 365, 30, 730, 1125, 365,…
## $ calendar_updated                             <chr> "5 months ago", "27 mont…
## $ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE, …
## $ availability_30                              <dbl> 19, 0, 19, 19, 19, 2, 0,…
## $ availability_60                              <dbl> 49, 0, 49, 49, 49, 2, 0,…
## $ availability_90                              <dbl> 79, 0, 79, 79, 79, 2, 0,…
## $ availability_365                             <dbl> 354, 0, 354, 354, 169, 2…
## $ calendar_last_scraped                        <date> 2020-06-20, 2020-06-20,…
## $ number_of_reviews                            <dbl> 99, 2, 66, 10, 290, 26, …
## $ number_of_reviews_ltm                        <dbl> 7, 0, 1, 1, 22, 0, 2, 0,…
## $ first_review                                 <date> 2010-08-25, 2017-08-27,…
## $ last_review                                  <date> 2020-01-06, 2017-10-08,…
## $ review_scores_rating                         <dbl> 91, 100, 93, 98, 97, 77,…
## $ review_scores_accuracy                       <dbl> 9, 10, 10, 9, 10, 8, 8, …
## $ review_scores_cleanliness                    <dbl> 8, 9, 9, 9, 10, 7, 7, 8,…
## $ review_scores_checkin                        <dbl> 10, 10, 10, 9, 10, 9, 9,…
## $ review_scores_communication                  <dbl> 10, 10, 10, 10, 10, 9, 9…
## $ review_scores_location                       <dbl> 10, 9, 10, 10, 10, 9, 9,…
## $ review_scores_value                          <dbl> 9, 9, 10, 9, 10, 8, 9, 8…
## $ requires_license                             <lgl> FALSE, FALSE, FALSE, FAL…
## $ license                                      <chr> NA, NA, "Exempt", "Exemp…
## $ jurisdiction_names                           <lgl> NA, NA, NA, NA, NA, NA, …
## $ instant_bookable                             <lgl> FALSE, TRUE, TRUE, TRUE,…
## $ is_business_travel_ready                     <lgl> FALSE, FALSE, FALSE, FAL…
## $ cancellation_policy                          <chr> "strict_14_with_grace_pe…
## $ require_guest_profile_picture                <lgl> FALSE, FALSE, FALSE, FAL…
## $ require_guest_phone_verification             <lgl> FALSE, FALSE, FALSE, FAL…
## $ calculated_host_listings_count               <dbl> 5, 4, 5, 5, 1, 5, 5, 6, …
## $ calculated_host_listings_count_entire_homes  <dbl> 5, 0, 5, 5, 1, 5, 5, 5, …
## $ calculated_host_listings_count_private_rooms <dbl> 0, 3, 0, 0, 0, 0, 0, 1, …
## $ calculated_host_listings_count_shared_rooms  <dbl> 0, 1, 0, 0, 0, 0, 0, 0, …
## $ reviews_per_month                            <dbl> 0.83, 0.06, 0.73, 0.11, …

From this output we can see that:

- there are just over 36 thousand observations (Airbnb listings in Beijing) in the data set
- 106 different variables are included in the data
- these variables are a mixture of ‘double’, ‘character’, ‘logical’ and ‘date’ types
- straightaway we can see that some of our ‘price’ variables include dollar signs ($) and are stored as ‘character’ variables rather than ‘double’ variables

Since this is a large data set with a lot going on, we will compute some summary statistics on key variables

3.2 Summary Statistics and Missing Values

  listings <- data %>%
    # Keep only the columns used in the rest of the analysis
    select(
      price,
      cleaning_fee,
      extra_people,
      room_type,
      property_type,
      number_of_reviews,
      review_scores_rating,
      longitude,
      latitude,
      neighbourhood,
      minimum_nights,
      guests_included,
      bathrooms,
      bedrooms,
      beds,
      accommodates,
      host_is_superhost,
      neighbourhood_cleansed,
      cancellation_policy,
      listing_url,
      is_location_exact,
      security_deposit,
      review_scores_cleanliness,
      instant_bookable,
      amenities,
      calculated_host_listings_count,
      reviews_per_month
    ) %>%
    # The four monetary columns arrive as text such as "$1,203.00";
    # parse_number() turns each of them into a plain double.
    mutate(
      across(
        c(price, cleaning_fee, extra_people, security_deposit),
        parse_number
      )
    )

Now that we have all the variables in the format required, we can move on to the quality of the data.

3.2.1 Removing Missing Values

# Summarise every selected column (missing-value counts, completeness and
# basic statistics) and render the result as a styled table.
skim(listings) %>%
  kbl() %>%
  kable_styling()
skim_type skim_variable n_missing complete_rate character.min character.max character.empty character.n_unique character.whitespace logical.mean logical.count numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
character room_type 0 1.000 11 15 0 3 0 NA NA NA NA NA NA NA NA NA NA
character property_type 0 1.000 3 22 0 45 0 NA NA NA NA NA NA NA NA NA NA
character neighbourhood 13370 0.632 3 36 0 61 0 NA NA NA NA NA NA NA NA NA NA
character neighbourhood_cleansed 0 1.000 3 16 0 16 0 NA NA NA NA NA NA NA NA NA NA
character cancellation_policy 0 1.000 8 27 0 3 0 NA NA NA NA NA NA NA NA NA NA
character listing_url 0 1.000 34 37 0 36283 0 NA NA NA NA NA NA NA NA NA NA
character amenities 0 1.000 2 1917 0 28222 0 NA NA NA NA NA NA NA NA NA NA
logical host_is_superhost 1 1.000 NA NA NA NA NA 0.264 FAL: 26711, TRU: 9571 NA NA NA NA NA NA NA NA
logical is_location_exact 0 1.000 NA NA NA NA NA 0.565 TRU: 20497, FAL: 15786 NA NA NA NA NA NA NA NA
logical instant_bookable 0 1.000 NA NA NA NA NA 0.643 TRU: 23333, FAL: 12950 NA NA NA NA NA NA NA NA
numeric price 0 1.000 NA NA NA NA NA NA NA 726.046 1861.040 0.00 255.00 396.00 651.00 70723.0 ▇▁▁▁▁
numeric cleaning_fee 23123 0.363 NA NA NA NA NA NA NA 60.943 218.669 0.00 0.00 40.00 70.00 10000.0 ▇▁▁▁▁
numeric extra_people 0 1.000 NA NA NA NA NA NA NA 20.474 79.101 0.00 0.00 0.00 0.00 2118.0 ▇▁▁▁▁
numeric number_of_reviews 0 1.000 NA NA NA NA NA NA NA 6.752 16.834 0.00 0.00 1.00 5.00 344.0 ▇▁▁▁▁
numeric review_scores_rating 16270 0.552 NA NA NA NA NA NA NA 94.789 10.836 20.00 94.00 100.00 100.00 100.0 ▁▁▁▁▇
numeric longitude 0 1.000 NA NA NA NA NA NA NA 116.442 0.258 115.47 116.34 116.43 116.50 117.5 ▁▁▇▁▁
numeric latitude 0 1.000 NA NA NA NA NA NA NA 40.022 0.235 39.46 39.90 39.94 40.05 41.0 ▁▇▁▂▁
numeric minimum_nights 0 1.000 NA NA NA NA NA NA NA 4.308 28.307 1.00 1.00 1.00 1.00 1086.0 ▇▁▁▁▁
numeric guests_included 0 1.000 NA NA NA NA NA NA NA 1.365 1.257 1.00 1.00 1.00 1.00 16.0 ▇▁▁▁▁
numeric bathrooms 21 0.999 NA NA NA NA NA NA NA 1.424 1.375 0.00 1.00 1.00 1.50 101.5 ▇▁▁▁▁
numeric bedrooms 142 0.996 NA NA NA NA NA NA NA 1.663 1.480 0.00 1.00 1.00 2.00 50.0 ▇▁▁▁▁
numeric beds 380 0.990 NA NA NA NA NA NA NA 2.242 2.754 0.00 1.00 1.00 2.00 115.0 ▇▁▁▁▁
numeric accommodates 0 1.000 NA NA NA NA NA NA NA 3.742 3.090 1.00 2.00 2.00 4.00 18.0 ▇▁▁▁▁
numeric security_deposit 23793 0.344 NA NA NA NA NA NA NA 655.045 2337.306 0.00 0.00 200.00 700.00 35362.0 ▇▁▁▁▁
numeric review_scores_cleanliness 16272 0.552 NA NA NA NA NA NA NA 9.518 1.065 2.00 9.00 10.00 10.00 10.0 ▁▁▁▁▇
numeric calculated_host_listings_count 0 1.000 NA NA NA NA NA NA NA 9.543 13.636 1.00 2.00 5.00 11.00 89.0 ▇▁▁▁▁
numeric reviews_per_month 15644 0.569 NA NA NA NA NA NA NA 0.649 0.850 0.01 0.14 0.31 0.81 22.9 ▇▁▁▁▁

Here we can see that `cleaning_fee` has an extremely high number of missing (`NA`) values. This is most likely due to some properties including a cleaning fee within the price, and then not listing the cleaning fee as ‘$0’. A similar issue arises with `security_deposit`.

- In consumer psychology, additional costs are often viewed negatively.

data_cleaned <- listings %>%
  mutate(
    # A missing cleaning fee is treated as "no separate fee", i.e. 0
    cleaning_fee = coalesce(cleaning_fee, 0),
    # Same treatment for the security deposit ...
    security_deposit = coalesce(security_deposit, 0),
    # ... and for reviews_per_month
    reviews_per_month = coalesce(reviews_per_month, 0),
    # Amenity flags: TRUE when the amenities text mentions the keyword with
    # either a capitalised or lower-case first letter, FALSE otherwise
    # (including when amenities itself is missing)
    wifi = coalesce(str_detect(amenities, "Wifi|wifi"), FALSE),
    breakfast = coalesce(str_detect(amenities, "Breakfast|breakfast"), FALSE)
  )

# Spot-check the two new amenity flags next to price
select(data_cleaned, price, wifi, breakfast)
## # A tibble: 36,283 x 3
##    price wifi  breakfast
##    <dbl> <lgl> <lgl>    
##  1   835 TRUE  FALSE    
##  2  1203 TRUE  TRUE     
##  3   602 TRUE  FALSE    
##  4   602 TRUE  FALSE    
##  5   411 TRUE  TRUE     
##  6   552 TRUE  FALSE    
##  7   601 TRUE  FALSE    
##  8   403 TRUE  FALSE    
##  9   743 TRUE  FALSE    
## 10   418 TRUE  FALSE    
## # … with 36,273 more rows
# Confirm the NA replacement worked: cleaning_fee should now report zero
# missing values. kbl()/kable_styling() only format the summary as a table.
skim(data_cleaned, cleaning_fee) %>%
  kbl() %>%
  kable_styling()
skim_type skim_variable n_missing complete_rate numeric.mean numeric.sd numeric.p0 numeric.p25 numeric.p50 numeric.p75 numeric.p100 numeric.hist
numeric cleaning_fee 0 1 22.1 135 0 0 0 0 10000 ▇▁▁▁▁

3.3 Visualising The Data

3.3.1 Numerical Data

# First trial panel: density of price on its default (unrestricted) x-axis
p1 <- data_cleaned %>%
  ggplot(aes(x = price)) +
  geom_density() +
  theme_bw()

Before creating plots for all other numerical variables, let’s check the readability

# Draw the trial panel to judge how readable the default axis is
print(p1)

# With default axes, extreme values make it difficult to see the variability
# of most variables, so most panels below get a hand-picked upper x-limit
# (e.g. 10000 for price).
# Note: xlim() discards values outside the range, so each curve describes
# only the retained observations.

# Common skeleton for every panel: a density curve on the black-and-white theme
density_base <- ggplot(data = data_cleaned) +
  geom_density() +
  theme_bw()

p1a <- density_base + aes(x = price) + xlim(0, 10000)
p2a <- density_base + aes(x = cleaning_fee) + xlim(0, 300)
p3a <- density_base + aes(x = guests_included) + xlim(0, 8)
p4a <- density_base + aes(x = extra_people) + xlim(0, 400)
p5a <- density_base + aes(x = number_of_reviews) + xlim(0, 100)
p6a <- density_base + aes(x = review_scores_rating) + xlim(0, 100)
p7a <- density_base + aes(x = minimum_nights) + xlim(0, 150)
# accommodates is the one panel left on its default axis
p8a <- density_base + aes(x = accommodates)
p9a <- density_base + aes(x = beds) + xlim(0, 20)
p10a <- density_base + aes(x = bathrooms) + xlim(0, 20)
p11a <- density_base + aes(x = bedrooms) + xlim(0, 15)

# Lay all eleven panels out in a single grid
p1a + p2a + p3a + p4a + p5a + p6a + p7a + p8a + p9a + p10a + p11a

# Pairwise scatter plots and correlation coefficients for the numeric
# variables, security_deposit included
data_cleaned %>%
  select(
    price, cleaning_fee, guests_included,
    extra_people, number_of_reviews, review_scores_rating,
    minimum_nights, accommodates, beds, bathrooms, bedrooms, security_deposit
  ) %>%
  ggpairs()

> Notable correlations with price are: 1. Accommodates (number of people the listing can accommodate) 2. Bedrooms (number of bedrooms at the listing) 3. Bathrooms (number of bathrooms at the listing) 4. Beds (number of beds at the listing) 5. Cleaning fee (additional flat cleaning fee) 6. Guests included (number of guests included in the price and exempt from the extra-person fee) 7. Extra People (charge per night for each person over the number of guests included)

Notable correlations between variables: 1. Accommodates/Beds/Bathrooms/Bedrooms - the greater the number of rooms, the greater the number of guests the listing can accommodate

These plots demonstrate????

3.3.2 Categorical Data

Some of the character variables have lots of different values, e.g. `property_type`. Here we look at cleaning this to make it more manageable.

# Frequency of each property type, most common first
data_cleaned %>%
  count(property_type, sort = TRUE)
## # A tibble: 45 x 2
##    property_type          n
##    <chr>              <int>
##  1 Apartment          14428
##  2 Condominium         4761
##  3 House               4129
##  4 Loft                2960
##  5 Serviced apartment  2189
##  6 Farm stay           1330
##  7 Villa               1222
##  8 Bungalow             985
##  9 Cottage              596
## 10 Townhouse            513
## # … with 35 more rows

We’re now classifying different types of properties into 5 groups - the 4 most prominent ones and remaining smaller categories labeled as ‘Other’.

cleaning <- data_cleaned %>%
  mutate(
    # Collapse property_type into five groups: the four most frequent types
    # keep their own label and everything else (e.g. "Boutique hotel")
    # becomes "Other"
    prop_type_simplified = if_else(
      property_type %in% c("Apartment", "Condominium", "House", "Loft"),
      property_type,
      "Other"
    )
  )

Now that our categorical variables are cleaned, we can inspect the variability as we did with the numerical variables, this time using bar plots. Plotting property types, room types, super host status and cancellation policy, to analyse their distribution.

# One bar chart per categorical variable, all on the black-and-white theme

# Simplified property type
p12 <- cleaning %>%
  ggplot(aes(x = prop_type_simplified)) +
  geom_bar() +
  theme_bw()

# Room type
p13 <- cleaning %>%
  ggplot(aes(x = room_type)) +
  geom_bar() +
  theme_bw()

# Superhost status
p14 <- cleaning %>%
  ggplot(aes(x = host_is_superhost)) +
  geom_bar() +
  theme_bw()

# Cancellation policy
p15 <- cleaning %>%
  ggplot(aes(x = cancellation_policy)) +
  geom_bar() +
  theme_bw()

# Combine the four charts into one grid with patchwork
p12 + p13 + p14 + p15

commentary needed on bar plots

3.3.3 Preliminary Correlation Analysis

# Correlation overview of the numeric variables.
# Despite its name, data_numerical holds the ggpairs plot (a grid of scatter
# plots with correlation coefficients), not a data frame.
data_numerical <- data_cleaned %>%
  # Variables to plot against each other
  select(
    price, cleaning_fee, guests_included, extra_people, number_of_reviews,
    review_scores_rating, minimum_nights,
    accommodates, beds, bathrooms, bedrooms
  ) %>%
  ggpairs()

# Render the grid
print(data_numerical)

> Notable correlations with price are: 1. Accommodates (number of people the listing can accommodate) 2. Bedrooms (number of bedrooms at the listing) 3. Bathrooms (number of bathrooms at the listing) 4. Beds (number of beds at the listing) 5. Cleaning fee (additional flat cleaning fee) 6. Guests included (number of guests included in the price and exempt from the extra-person fee) 7. Extra People (charge per night for each person over the number of guests included)

Notable correlations between variables: 1. Accomodates/Beds/Bathrooms/Bedrooms/ - this makes sense because…???? 1.

3.4 Mapping

As we are looking at data over a geographical region, it can be helpful to see the geospatial spread of the Airbnb listings. Here we use the leaflet package to map our longitude and latitude data onto a map.

# Interactive map of the listings whose minimum stay is at most 4 nights
cleaning %>%
  filter(minimum_nights <= 4) %>%
  leaflet() %>%
  # Base map drawn underneath the data points
  addProviderTiles("OpenStreetMap.Mapnik") %>%
  # One small marker per listing: the popup shows the listing URL and the
  # label shows the property type
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    fillColor = "blue",
    fillOpacity = 0.4,
    popup = ~listing_url,
    label = ~property_type
  )

4 Regression Analysis

4.1 Preparation for Regression Analysis

In order to run a regression model, we will transform our price data into an approximately ‘normal’ distribution.

# Preview how a log transformation reshapes a skewed distribution by drawing
# the density on a log10 x-axis.
# NOTE(review): the text introducing this section talks about transforming
# price, yet the variable plotted here is minimum_nights - confirm which one
# was intended.
ggplot(cleaning, aes(x = minimum_nights)) +
  geom_density() +
  scale_x_log10()

# Print the tibble to inspect the current state of the data
cleaning
## # A tibble: 36,283 x 30
##    price cleaning_fee extra_people room_type property_type number_of_revie…
##    <dbl>        <dbl>        <dbl> <chr>     <chr>                    <dbl>
##  1   835           71           71 Entire h… Serviced apa…               99
##  2  1203            0            0 Private … Guest suite                  2
##  3   602            0            0 Entire h… Apartment                   66
##  4   602           30            0 Entire h… Apartment                   10
##  5   411           71          106 Entire h… House                      290
##  6   552            0            0 Entire h… Apartment                   26
##  7   601            0            0 Entire h… Apartment                   39
##  8   403            0           64 Entire h… Apartment                   30
##  9   743          283            0 Entire h… Apartment                  117
## 10   418           35           80 Entire h… Apartment                    3
## # … with 36,273 more rows, and 24 more variables: review_scores_rating <dbl>,
## #   longitude <dbl>, latitude <dbl>, neighbourhood <chr>, minimum_nights <dbl>,
## #   guests_included <dbl>, bathrooms <dbl>, bedrooms <dbl>, beds <dbl>,
## #   accommodates <dbl>, host_is_superhost <lgl>, neighbourhood_cleansed <chr>,
## #   cancellation_policy <chr>, listing_url <chr>, is_location_exact <lgl>,
## #   security_deposit <dbl>, review_scores_cleanliness <dbl>,
## #   instant_bookable <lgl>, amenities <chr>,
## #   calculated_host_listings_count <dbl>, reviews_per_month <dbl>, wifi <lgl>,
## #   breakfast <lgl>, prop_type_simplified <chr>

As we are looking to model the price of an Airbnb in Beijing for travel/tourism, we should look into the minimum_nights variable. This variable states the minimum number of nights you are able to book the listing for.

# How often each minimum_nights value occurs, most frequent first
cleaning %>%
  count(minimum_nights, sort = TRUE)
## # A tibble: 66 x 2
##    minimum_nights     n
##             <dbl> <int>
##  1              1 30216
##  2              2  2178
##  3              3  1024
##  4             30   819
##  5              7   369
##  6              5   368
##  7             15   316
##  8             90   175
##  9             10   161
## 10             60    89
## # … with 56 more rows
# Summary statistics (min, quartiles, max, mean, sd, n, missing) for
# minimum_nights, shown as a styled table
favstats(~ minimum_nights, data = cleaning) %>%
  kbl() %>%
  kable_styling()
min Q1 median Q3 max mean sd n missing
1 1 1 1 1086 4.31 28.3 36283 0

From the above, we can infer the following:

- The most common values for ‘minimum nights’ are 1 to 3 nights, as they account for 92.1% of total listings. The next biggest category is ‘30 minimum nights’ (2.26% of total listings).
- A minimum of 30 nights seems rather strange - maybe the people booking these Airbnbs are visiting Beijing for reasons other than leisure/travel. For example, they may prefer Airbnbs as a budget-friendly alternative to hotels for longer stays intended for business-related work, etc.
- There are also 61 listings with a minimum of 365 nights (1 year), which implies that some Airbnbs are intended more for long-term renting or sub-letting.

4.2 Creating Variable to Model

# Lookup table joined on `neighbourhood`; it supplies the `Ring` column
neighbourhoodring <- vroom::vroom("neighbourhoodring.csv")

regression_data <- cleaning %>%
  # Keep listings whose minimum stay is at most 4 nights
  filter(minimum_nights <= 4) %>%
  # Attach the ring of each listing's neighbourhood
  left_join(neighbourhoodring, by = "neighbourhood", copy = TRUE) %>%
  mutate(
    # Price of 2 people booking for 4 nights: the per-night extra_people
    # charge applies only when fewer than 2 guests are included in the price
    price_for_4_notlog = case_when(
      guests_included < 2 ~ cleaning_fee + (4 * (price + extra_people)),
      TRUE ~ cleaning_fee + (4 * price)
    ),
    # Natural log of the 4-night price; the tiny offset keeps log() finite
    # when the price is 0.
    # NOTE(review): such rows end up near log(0.00001), far below the rest -
    # confirm this is acceptable for the regression.
    price_4_nights = log(price_for_4_notlog + 0.00001),
    # Classify the neighbourhood into 5 areas (Ring 2-6) following Beijing's
    # ring layout.
    # NOTE(review): any listing without a matching Ring (including a missing
    # neighbourhood) falls through to "Ring 6" - confirm this is intended.
    neighbourhood_simplified = case_when(
      Ring == "2" ~ "Ring 2",
      Ring == "3" ~ "Ring 3",
      Ring == "4" ~ "Ring 4",
      Ring == "5" ~ "Ring 5",
      TRUE ~ "Ring 6"
    )
  ) %>%
  # The raw Ring column is no longer needed once it has been recoded
  select(-Ring)

regression_data
## # A tibble: 33,497 x 33
##    price cleaning_fee extra_people room_type property_type number_of_revie…
##    <dbl>        <dbl>        <dbl> <chr>     <chr>                    <dbl>
##  1   835           71           71 Entire h… Serviced apa…               99
##  2  1203            0            0 Private … Guest suite                  2
##  3   602            0            0 Entire h… Apartment                   66
##  4   602           30            0 Entire h… Apartment                   10
##  5   411           71          106 Entire h… House                      290
##  6   552            0            0 Entire h… Apartment                   26
##  7   601            0            0 Entire h… Apartment                   39
##  8   403            0           64 Entire h… Apartment                   30
##  9   743          283            0 Entire h… Apartment                  117
## 10   418           35           80 Entire h… Apartment                    3
## # … with 33,487 more rows, and 27 more variables: review_scores_rating <dbl>,
## #   longitude <dbl>, latitude <dbl>, neighbourhood <chr>, minimum_nights <dbl>,
## #   guests_included <dbl>, bathrooms <dbl>, bedrooms <dbl>, beds <dbl>,
## #   accommodates <dbl>, host_is_superhost <lgl>, neighbourhood_cleansed <chr>,
## #   cancellation_policy <chr>, listing_url <chr>, is_location_exact <lgl>,
## #   security_deposit <dbl>, review_scores_cleanliness <dbl>,
## #   instant_bookable <lgl>, amenities <chr>,
## #   calculated_host_listings_count <dbl>, reviews_per_month <dbl>, wifi <lgl>,
## #   breakfast <lgl>, prop_type_simplified <chr>, price_for_4_notlog <dbl>,
## #   price_4_nights <dbl>, neighbourhood_simplified <chr>
  # Histogram of the untransformed 4-night price, x-axis limited to 0-40000
regression_data %>%
  ggplot(aes(x = price_for_4_notlog)) +
  geom_histogram() +
  xlim(0, 40000)

# Density of the logged 4-night price
regression_data %>%
  ggplot(aes(x = price_4_nights)) +
  geom_density()

# Working with the logged price means effects are read as (approximate)
# percentage changes in price rather than unit changes

# Inspect the final data set used by the regression models
regression_data %>%
  glimpse()
## Rows: 33,497
## Columns: 33
## $ price                          <dbl> 835, 1203, 602, 602, 411, 552, 601, 40…
## $ cleaning_fee                   <dbl> 71, 0, 0, 30, 71, 0, 0, 0, 283, 35, 0,…
## $ extra_people                   <dbl> 71, 0, 0, 0, 106, 0, 0, 64, 0, 80, 63,…
## $ room_type                      <chr> "Entire home/apt", "Private room", "En…
## $ property_type                  <chr> "Serviced apartment", "Guest suite", "…
## $ number_of_reviews              <dbl> 99, 2, 66, 10, 290, 26, 39, 30, 117, 3…
## $ review_scores_rating           <dbl> 91, 100, 93, 98, 97, 77, 86, 83, 99, 1…
## $ longitude                      <dbl> 116, 117, 116, 116, 116, 116, 116, 116…
## $ latitude                       <dbl> 39.9, 40.7, 39.9, 39.9, 39.9, 39.9, 39…
## $ neighbourhood                  <chr> "Chaoyang", NA, "ITC", "Chaoyang", "Do…
## $ minimum_nights                 <dbl> 2, 1, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 2,…
## $ guests_included                <dbl> 6, 1, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 4,…
## $ bathrooms                      <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1,…
## $ bedrooms                       <dbl> 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2,…
## $ beds                           <dbl> 4, 1, 1, 1, 2, 1, 2, 1, 3, 1, 1, 1, 2,…
## $ accommodates                   <dbl> 9, 2, 2, 2, 3, 2, 4, 2, 4, 3, 2, 2, 6,…
## $ host_is_superhost              <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALS…
## $ neighbourhood_cleansed         <chr> "朝阳区 / Chaoyang", "密云县 / Miyun", "朝阳区 …
## $ cancellation_policy            <chr> "strict_14_with_grace_period", "strict…
## $ listing_url                    <chr> "https://www.airbnb.com/rooms/44054", …
## $ is_location_exact              <lgl> TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, …
## $ security_deposit               <dbl> 708, 0, 0, 0, 0, 0, 0, 700, 0, 1000, 9…
## $ review_scores_cleanliness      <dbl> 8, 9, 9, 9, 10, 7, 7, 8, 10, 7, 7, 9, …
## $ instant_bookable               <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, …
## $ amenities                      <chr> "{TV,\"Cable TV\",Internet,Wifi,\"Air …
## $ calculated_host_listings_count <dbl> 5, 4, 5, 5, 1, 5, 5, 6, 1, 8, 6, 8, 1,…
## $ reviews_per_month              <dbl> 0.83, 0.06, 0.73, 0.11, 2.63, 0.24, 0.…
## $ wifi                           <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
## $ breakfast                      <lgl> FALSE, TRUE, FALSE, FALSE, TRUE, FALSE…
## $ prop_type_simplified           <chr> "Other", "Other", "Apartment", "Apartm…
## $ price_for_4_notlog             <dbl> 3411, 4812, 2408, 2438, 1715, 2208, 24…
## $ price_4_nights                 <dbl> 8.13, 8.48, 7.79, 7.80, 7.45, 7.70, 7.…
## $ neighbourhood_simplified       <chr> "Ring 4", "Ring 6", "Ring 3", "Ring 4"…

4.2.1 Building Linear Regression Models

# Model 1: baseline specification - property type plus the two review
# variables (number of reviews and overall rating).
model1 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating,
  data = regression_data
)

model1 %>% tidy(conf.int=TRUE) 
## # A tibble: 7 x 7
##   term                 estimate std.error statistic   p.value conf.low conf.high
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)           6.91     0.0486      142.   0.         6.81      7.00   
## 2 prop_type_simplifie… -0.0594   0.0165       -3.60 3.15e-  4 -0.0918   -0.0271 
## 3 prop_type_simplifie…  0.204    0.0184       11.1  2.54e- 28  0.168     0.240  
## 4 prop_type_simplifie…  0.101    0.0200        5.07 4.11e-  7  0.0621    0.140  
## 5 prop_type_simplifie…  0.455    0.0148       30.7  1.47e-202  0.426     0.484  
## 6 number_of_reviews    -0.00204  0.000266     -7.66 1.88e- 14 -0.00256  -0.00152
## 7 review_scores_rating  0.00447  0.000508      8.80 1.45e- 18  0.00348   0.00547
model1 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.067 0.067 0.762 223 0 6 -21392 42800 42863 10832 18636 18643

Here, property type is a categorical variable - it has five categories (Apartment, Condominium, House, Loft and Other) and therefore makes up 4 dummy variables in the regression model, with ‘Apartment’ as the baseline. For example, the intercept term for ‘Apartment’ would just be ~ 6.91. For ‘House’, prop_type_simplifiedHouse = 1 (the other three dummies = 0) and the intercept term would be 6.91 + 0.2 ~ 7.11. For ‘Other’, prop_type_simplifiedOther = 1 (the other three dummies = 0) and the intercept term would be 6.91 + 0.46 ~ 7.37. Therefore, relative to apartments, price_4_nights will be higher for houses, lofts and ‘Other’ properties but lower for condominiums.

(Note: our Y variable is in log, so each coefficient multiplied by 100 approximates the percentage change in the price for 4 nights per unit change in the corresponding X variable, holding the other variables constant.)

Other variables such as number_of_reviews and review_scores_rating are statistically significant and explain the variation in price_4_nights, however, a point worth noting is that additional number_of_reviews do not lead to an increase in cost for 4 nights as the reviews may not necessarily be good reviews. On the other hand, review_scores_rating has a positive effect on price_4_nights which means that properties with a higher score/ rating would be more pricey.

# Model 2: model 1 extended with room_type.
model2 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type,
  data = regression_data
)

model2 %>% tidy(conf.int=TRUE)  
## # A tibble: 9 x 7
##   term                  estimate std.error statistic  p.value conf.low conf.high
##   <chr>                    <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)            7.12     0.0427      167.   0.        7.04     7.20    
## 2 prop_type_simplified… -0.0321   0.0145       -2.22 2.63e- 2 -0.0605  -0.00378 
## 3 prop_type_simplified…  0.277    0.0162       17.1  3.51e-65  0.245    0.309   
## 4 prop_type_simplified… -0.0315   0.0176       -1.79 7.35e- 2 -0.0660   0.00300 
## 5 prop_type_simplified…  0.530    0.0131       40.6  0.        0.504    0.555   
## 6 number_of_reviews     -0.00137  0.000234     -5.85 5.15e- 9 -0.00182 -0.000908
## 7 review_scores_rating   0.00480  0.000445     10.8  6.09e-27  0.00392  0.00567 
## 8 room_typePrivate room -0.671    0.0109      -61.6  0.       -0.692   -0.649   
## 9 room_typeShared room  -1.21     0.0232      -52.1  0.       -1.26    -1.17
model2 %>% glance()  %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.284 0.284 0.668 924 0 8 -18924 37867 37946 8312 18634 18643

From the above table, we know that room_type has a very significant impact on price_4_nights as adjusted R-squared for model 2 is more than 4 times the adjusted R-squared for model 1. Room type is also a categorical variable with 3 categories, and hence makes up 2 dummy variables in the regression model.

We notice that the t-stat values for several variables which were already present in model 1 have changed noticeably in model 2 (for instance, the coefficient for Loft changes sign and is no longer significant at the 5% level), indicating that there may be some multicollinearity between the variables. To check if that’s the case, we’ll calculate VIF.

vif(model2)
##                      GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.04  4            1.01
## number_of_reviews    1.01  1            1.01
## review_scores_rating 1.01  1            1.00
## room_type            1.04  2            1.01

4.2.2 Comparing model1 and model2

# Side-by-side regression table for model1 and model2. Only the listed fit
# statistics are reported; coefficients significant at the 5% level are
# bolded and significance stars are switched off.
huxreg(
  model1, model2,
  statistics = c(
    "#observations" = "nobs",
    "R squared" = "r.squared",
    "Adj. R Squared" = "adj.r.squared",
    "Residual SE" = "sigma"
  ),
  bold_signif = 0.05,
  stars = NULL
) %>%
  set_caption("Comparison of Models 1.0")
Comparison of Models 1.0
(1)(2)
(Intercept)6.909 7.119 
(0.049)(0.043)
prop_type_simplifiedCondominium-0.059 -0.032 
(0.016)(0.014)
prop_type_simplifiedHouse0.204 0.277 
(0.018)(0.016)
prop_type_simplifiedLoft0.101 -0.032 
(0.020)(0.018)
prop_type_simplifiedOther0.455 0.530 
(0.015)(0.013)
number_of_reviews-0.002 -0.001 
(0.000)(0.000)
review_scores_rating0.004 0.005 
(0.001)(0.000)
room_typePrivate room     -0.671 
     (0.011)
room_typeShared room     -1.211 
     (0.023)
#observations18643     18643     
R squared0.067 0.284 
Adj. R Squared0.067 0.284 
Residual SE0.762 0.668 

4.2.3 exploring more variables

Previously, we plotted a correlation matrix to see which variables can be added to our regression model.

# Model 3: model 2 extended with the size variables - bedrooms, bathrooms,
# beds and the number of guests the property accommodates.
model3 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates,
  data = regression_data
)

model3 %>% tidy(conf.int=TRUE)
## # A tibble: 13 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)         6.73      0.0371      181.   0.         6.65e+0   6.80e+0
##  2 prop_type_simplif… -0.0361    0.0124       -2.91 3.68e-  3 -6.04e-2  -1.17e-2
##  3 prop_type_simplif…  0.123     0.0140        8.78 1.77e- 18  9.57e-2   1.51e-1
##  4 prop_type_simplif… -0.0673    0.0152       -4.44 9.02e-  6 -9.71e-2  -3.76e-2
##  5 prop_type_simplif…  0.265     0.0117       22.6  1.06e-111  2.42e-1   2.87e-1
##  6 number_of_reviews  -0.000362  0.000201     -1.80 7.13e-  2 -7.56e-4   3.14e-5
##  7 review_scores_rat…  0.00326   0.000385      8.46 2.79e- 17  2.50e-3   4.01e-3
##  8 room_typePrivate … -0.416     0.00999     -41.6  0.        -4.35e-1  -3.96e-1
##  9 room_typeShared r… -0.919     0.0208      -44.2  0.        -9.60e-1  -8.79e-1
## 10 bedrooms            0.0805    0.00723      11.1  1.21e- 28  6.63e-2   9.46e-2
## 11 bathrooms           0.0302    0.00428       7.05 1.88e- 12  2.18e-2   3.85e-2
## 12 beds               -0.0308    0.00337      -9.15 6.53e- 20 -3.74e-2  -2.42e-2
## 13 accommodates        0.112     0.00306      36.6  8.29e-284  1.06e-1   1.18e-1
model3 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.474 0.474 0.573 1395 0 12 -15992 32012 32121 6086 18559 18572
# using VIF to check for multicollinearity
vif(model3)
##                      GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.15  4            1.02
## number_of_reviews    1.02  1            1.01
## review_scores_rating 1.01  1            1.01
## room_type            1.26  2            1.06
## bedrooms             4.39  1            2.10
## bathrooms            1.62  1            1.27
## beds                 3.12  1            1.77
## accommodates         4.42  1            2.10

In the table above, we can see that the VIF for bedrooms, beds and accommodates is higher than for the other variables. This is not a problem as such, since each VIF is still less than 5. Some correlation between these variables is expected, because the more beds and bedrooms a property has, the more guests it can accommodate.

Does price of a property vary significantly if host is a Superhost?

Superhosts are experienced hosts who are most dedicated to providing outstanding hospitality to their guests. They need to maintain certain standards in response rate, cancellation rate and overall rating to earn this badge. From that perspective, we hypothesize that other factors remaining constant, a Superhost will charge prices higher than the average host. Let’s see if that’s true.

# Model 5: the model 3 specification extended with superhost status
# (host_is_superhost).
model5 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost,
  data = regression_data
)

model5 %>% tidy(conf.int=TRUE)
## # A tibble: 14 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.76e+0  0.0373      181.   0.         6.68     6.83    
##  2 prop_type_simplifi… -3.80e-2  0.0124       -3.06 2.22e-  3 -0.0623  -0.0136  
##  3 prop_type_simplifi…  1.25e-1  0.0140        8.90 6.09e- 19  0.0973   0.152   
##  4 prop_type_simplifi… -7.02e-2  0.0152       -4.63 3.60e-  6 -0.0999  -0.0405  
##  5 prop_type_simplifi…  2.66e-1  0.0117       22.8  3.67e-113  0.243    0.289   
##  6 number_of_reviews   -7.03e-4  0.000207     -3.40 6.79e-  4 -0.00111 -0.000298
##  7 review_scores_rati…  2.74e-3  0.000392      6.99 2.90e- 12  0.00197  0.00351 
##  8 room_typePrivate r… -4.16e-1  0.00998     -41.7  0.        -0.436   -0.397   
##  9 room_typeShared ro… -9.15e-1  0.0208      -44.0  0.        -0.956   -0.874   
## 10 bedrooms             8.20e-2  0.00723      11.3  9.78e- 30  0.0679   0.0962  
## 11 bathrooms            2.97e-2  0.00427       6.95 3.73e- 12  0.0213   0.0381  
## 12 beds                -3.07e-2  0.00337      -9.12 8.00e- 20 -0.0373  -0.0241  
## 13 accommodates         1.12e-1  0.00306      36.6  9.70e-283  0.106    0.118   
## 14 host_is_superhostT…  6.21e-2  0.00916       6.78 1.24e- 11  0.0441   0.0800
model5 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.476 0.475 0.572 1295 0 13 -15968 31966 32083 6070 18557 18571

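Our hypothesis seems to be true; host_is_superhost is significant as per its t-stat and p-value. Since price_4_nights is in log, the coefficient of 0.0621 means one can expect the price for a Superhost’s property to be higher than an average host’s property by roughly 6.2%, other factors remaining constant.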

Is Location Exact?

Some hosts specify the exact location of their property; let’s see if that has any effect on the price for 4 nights.

# Model 6: model 5 extended with is_location_exact.
model6 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact,
  data = regression_data
)

model6 %>% tidy(conf.int=TRUE)
## # A tibble: 15 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.82e+0  0.0379      180.   0.         6.74     6.89    
##  2 prop_type_simplifi… -3.86e-2  0.0124       -3.12 1.83e-  3 -0.0629  -0.0143  
##  3 prop_type_simplifi…  1.12e-1  0.0141        7.95 1.92e- 15  0.0842   0.139   
##  4 prop_type_simplifi… -6.83e-2  0.0151       -4.51 6.37e-  6 -0.0979  -0.0386  
##  5 prop_type_simplifi…  2.52e-1  0.0118       21.4  2.41e-100  0.229    0.275   
##  6 number_of_reviews   -8.97e-4  0.000208     -4.32 1.54e-  5 -0.00130 -0.000491
##  7 review_scores_rati…  2.70e-3  0.000391      6.90 5.24e- 12  0.00193  0.00347 
##  8 room_typePrivate r… -4.24e-1  0.00999     -42.4  0.        -0.444   -0.404   
##  9 room_typeShared ro… -9.19e-1  0.0207      -44.3  0.        -0.960   -0.878   
## 10 bedrooms             8.04e-2  0.00722      11.1  1.03e- 28  0.0662   0.0945  
## 11 bathrooms            2.85e-2  0.00427       6.68 2.48e- 11  0.0201   0.0369  
## 12 beds                -3.00e-2  0.00336      -8.94 4.08e- 19 -0.0366  -0.0235  
## 13 accommodates         1.11e-1  0.00305      36.3  5.98e-279  0.105    0.117   
## 14 host_is_superhostT…  6.74e-2  0.00915       7.37 1.83e- 13  0.0495   0.0854  
## 15 is_location_exactT… -7.98e-2  0.00872      -9.15 6.30e- 20 -0.0969  -0.0627
model6 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.478 0.478 0.571 1213 0 14 -15926 31884 32010 6043 18556 18571

Well, the variable is_location_exact seems to be significant as per its t-stat and p-value; however, the negative coefficient is surprising. Maybe that has less to do with whether the specified location is exact and more to do with what the location actually is!

For this purpose, let us include neighbourhood location into our regression model. To make things simple, we created a new variable called neighbourhood_simplified which groups different listings into broader categories or rings.

# Model 7: model 6 extended with neighbourhood location
# (neighbourhood_simplified).
model7 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact +
    neighbourhood_simplified,
  data = regression_data
)

model7 %>% tidy(conf.int=TRUE)
## # A tibble: 19 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.96     0.0386      181.   0.         6.89      7.04   
##  2 prop_type_simplifi… -0.0335   0.0122       -2.74 6.18e-  3 -0.0574   -0.00951
##  3 prop_type_simplifi…  0.0995   0.0140        7.13 1.08e- 12  0.0721    0.127  
##  4 prop_type_simplifi… -0.0268   0.0154       -1.74 8.15e-  2 -0.0570    0.00335
##  5 prop_type_simplifi…  0.264    0.0120       21.9  4.34e-105  0.240     0.288  
##  6 number_of_reviews   -0.00171  0.000208     -8.20 2.54e- 16 -0.00211  -0.00130
##  7 review_scores_rati…  0.00310  0.000387      8.03 1.05e- 15  0.00235   0.00386
##  8 room_typePrivate r… -0.422    0.00990     -42.6  0.        -0.441    -0.402  
##  9 room_typeShared ro… -0.934    0.0206      -45.3  0.        -0.975    -0.894  
## 10 bedrooms             0.0971   0.00716      13.6  1.16e- 41  0.0830    0.111  
## 11 bathrooms            0.0355   0.00422       8.41 4.52e- 17  0.0272    0.0438 
## 12 beds                -0.0307   0.00331      -9.28 1.96e- 20 -0.0372   -0.0242 
## 13 accommodates         0.106    0.00302      35.1  1.09e-261  0.100     0.112  
## 14 host_is_superhostT…  0.0633   0.00903       7.01 2.50e- 12  0.0456    0.0810 
## 15 is_location_exactT… -0.0792   0.00863      -9.17 5.04e- 20 -0.0961   -0.0623 
## 16 neighbourhood_simp… -0.202    0.0146      -13.9  2.06e- 43 -0.231    -0.174  
## 17 neighbourhood_simp… -0.184    0.0131      -14.1  1.14e- 44 -0.210    -0.158  
## 18 neighbourhood_simp… -0.206    0.0374       -5.51 3.56e-  8 -0.280    -0.133  
## 19 neighbourhood_simp… -0.297    0.0130      -22.8  2.29e-113 -0.322    -0.271
# Model-level fit statistics for model7, rendered as a styled table.
model7 %>%
  glance() %>%
  kbl() %>%
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.492 0.492 0.563 999 0 18 -15668 31377 31533 5877 18552 18571

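neighbourhood_simplified is a categorical variable with 5 categories - Ring 2, Ring 3, Ring 4, Ring 5 and Ring 6 - and therefore makes up 4 dummy variables in the regression model, with one category serving as the baseline (Ring 2, assuming R’s default alphabetical ordering of levels). All four coefficients are negative and highly significant, ranging from about -0.18 to -0.30. Since price_4_nights is in log, listings in the other rings are roughly 17% to 26% cheaper than comparable listings in the baseline ring, other factors remaining constant.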

With inclusion of these location variables, our adjusted R-squared has increased to 0.492. Let’s continue to improve our model further. From the perspective of a host who is setting prices in accordance with the time, money and effort he spends in managing the property, and from the perspective of a traveler who is booking the Airbnb and paying that price, some other variables worth considering are -

  1. cancellation policy
  2. review scores specifically for cleanliness
  3. security deposit amount
  4. whether the property is instant bookable
  5. amenities like wifi and breakfast
# Model 8: model 7 extended with cancellation_policy.
model8 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact +
    neighbourhood_simplified + cancellation_policy,
  data = regression_data
)

model8 %>% tidy(conf.int=TRUE)
## # A tibble: 21 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.93     0.0387      179.   0.         6.86      7.01   
##  2 prop_type_simplifi… -0.0366   0.0122       -3.00 2.74e-  3 -0.0605   -0.0126 
##  3 prop_type_simplifi…  0.0985   0.0139        7.06 1.69e- 12  0.0712    0.126  
##  4 prop_type_simplifi… -0.0289   0.0154       -1.88 6.07e-  2 -0.0590    0.00130
##  5 prop_type_simplifi…  0.265    0.0120       22.0  4.66e-106  0.241     0.288  
##  6 number_of_reviews   -0.00186  0.000209     -8.89 6.96e- 19 -0.00227  -0.00145
##  7 review_scores_rati…  0.00307  0.000386      7.95 2.04e- 15  0.00231   0.00383
##  8 room_typePrivate r… -0.420    0.00989     -42.5  0.        -0.440    -0.401  
##  9 room_typeShared ro… -0.933    0.0206      -45.3  0.        -0.973    -0.892  
## 10 bedrooms             0.0976   0.00715      13.6  3.72e- 42  0.0836    0.112  
## # … with 11 more rows
model8 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.494 0.493 0.562 904 0 20 -15644 31332 31505 5862 18550 18571

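Cancellation policy (model 8) - adjusted R-squared rises only marginally (from 0.492 to 0.493), but AIC and BIC are both lower than for model 7.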

# Model 9: model 7 extended with review_scores_cleanliness.
model9 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact +
    neighbourhood_simplified + review_scores_cleanliness,
  data = regression_data
)

model9 %>% tidy(conf.int=TRUE)
## # A tibble: 20 x 7
##    term               estimate std.error statistic   p.value  conf.low conf.high
##    <chr>                 <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)         6.91     0.0410      168.   0.          6.83e+0   6.99   
##  2 prop_type_simplif… -0.0335   0.0122       -2.75 6.04e-  3  -5.75e-2  -0.00960
##  3 prop_type_simplif…  0.0989   0.0140        7.09 1.42e- 12   7.16e-2   0.126  
##  4 prop_type_simplif… -0.0274   0.0154       -1.78 7.55e-  2  -5.76e-2   0.00281
##  5 prop_type_simplif…  0.263    0.0120       21.8  4.17e-104   2.39e-1   0.286  
##  6 number_of_reviews  -0.00172  0.000208     -8.26 1.53e- 16  -2.13e-3  -0.00131
##  7 review_scores_rat…  0.00113  0.000622      1.81 6.98e-  2  -9.11e-5   0.00235
##  8 room_typePrivate … -0.422    0.00989     -42.7  0.         -4.41e-1  -0.403  
##  9 room_typeShared r… -0.931    0.0206      -45.1  0.         -9.71e-1  -0.890  
## 10 bedrooms            0.0971   0.00716      13.6  1.09e- 41   8.30e-2   0.111  
## 11 bathrooms           0.0355   0.00422       8.42 4.05e- 17   2.73e-2   0.0438 
## 12 beds               -0.0307   0.00331      -9.26 2.36e- 20  -3.71e-2  -0.0242 
## 13 accommodates        0.106    0.00302      35.2  2.39e-262   1.00e-1   0.112  
## 14 host_is_superhost…  0.0605   0.00905       6.68 2.46e- 11   4.27e-2   0.0782 
## 15 is_location_exact… -0.0796   0.00863      -9.22 3.26e- 20  -9.65e-2  -0.0627 
## 16 neighbourhood_sim… -0.203    0.0146      -13.9  8.59e- 44  -2.32e-1  -0.175  
## 17 neighbourhood_sim… -0.185    0.0131      -14.1  5.94e- 45  -2.10e-1  -0.159  
## 18 neighbourhood_sim… -0.207    0.0374       -5.53 3.28e-  8  -2.80e-1  -0.134  
## 19 neighbourhood_sim… -0.299    0.0130      -22.9  6.13e-115  -3.24e-1  -0.273  
## 20 review_scores_cle…  0.0256   0.00635       4.03 5.55e-  5   1.32e-2   0.0381
model9 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.493 0.492 0.563 948 0 19 -15658 31357 31522 5871 18548 18568

Cleanliness score (model 9) - significant, but AIC and BIC are higher compared to when we use cancellation policy (model 8).

# Model 10: model 7 extended with instant_bookable.
model10 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact +
    neighbourhood_simplified + instant_bookable,
  data = regression_data
)

model10 %>% tidy(conf.int=TRUE)
## # A tibble: 20 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.96     0.0389     179.    0.         6.88      7.04   
##  2 prop_type_simplifi… -0.0335   0.0122      -2.74  6.14e-  3 -0.0574   -0.00954
##  3 prop_type_simplifi…  0.0994   0.0140       7.12  1.11e- 12  0.0721    0.127  
##  4 prop_type_simplifi… -0.0269   0.0154      -1.74  8.12e-  2 -0.0571    0.00333
##  5 prop_type_simplifi…  0.264    0.0121      21.9   6.77e-105  0.240     0.287  
##  6 number_of_reviews   -0.00170  0.000208    -8.18  3.04e- 16 -0.00211  -0.00130
##  7 review_scores_rati…  0.00311  0.000387     8.03  1.04e- 15  0.00235   0.00386
##  8 room_typePrivate r… -0.422    0.00991    -42.6   0.        -0.441    -0.402  
##  9 room_typeShared ro… -0.934    0.0207     -45.2   0.        -0.974    -0.893  
## 10 bedrooms             0.0971   0.00716     13.6   1.15e- 41  0.0830    0.111  
## 11 bathrooms            0.0355   0.00422      8.40  4.64e- 17  0.0272    0.0437 
## 12 beds                -0.0307   0.00331     -9.28  1.93e- 20 -0.0372   -0.0242 
## 13 accommodates         0.106    0.00302     35.1   1.26e-261  0.100     0.112  
## 14 host_is_superhostT…  0.0632   0.00904      6.99  2.91e- 12  0.0454    0.0809 
## 15 is_location_exactT… -0.0794   0.00865     -9.17  5.14e- 20 -0.0963   -0.0624 
## 16 neighbourhood_simp… -0.203    0.0146     -13.9   2.08e- 43 -0.231    -0.174  
## 17 neighbourhood_simp… -0.184    0.0131     -14.1   1.11e- 44 -0.210    -0.159  
## 18 neighbourhood_simp… -0.206    0.0374      -5.51  3.55e-  8 -0.280    -0.133  
## 19 neighbourhood_simp… -0.297    0.0130     -22.7   5.39e-113 -0.322    -0.271  
## 20 instant_bookableTR…  0.00251  0.00891      0.282 7.78e-  1 -0.0150    0.0200
model10 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.492 0.492 0.563 947 0 19 -15668 31379 31543 5877 18551 18571

instant_bookable is not significant: its t-stat is low (0.282), with a p-value of 0.78.

# Model 11: model 7 extended with security_deposit on its original
# (untransformed) scale.
model11 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact +
    neighbourhood_simplified + security_deposit,
  data = regression_data
)

model11 %>% tidy(conf.int=TRUE)
## # A tibble: 20 x 7
##    term               estimate  std.error statistic   p.value conf.low conf.high
##    <chr>                 <dbl>      <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)         6.96e+0 0.0385        181.   0.         6.89e+0   7.04e+0
##  2 prop_type_simpli…  -3.45e-2 0.0122         -2.83 4.63e-  3 -5.84e-2  -1.06e-2
##  3 prop_type_simpli…   9.95e-2 0.0139          7.14 9.79e- 13  7.22e-2   1.27e-1
##  4 prop_type_simpli…  -2.60e-2 0.0154         -1.69 9.03e-  2 -5.62e-2   4.09e-3
##  5 prop_type_simpli…   2.63e-1 0.0120         21.9  7.48e-105  2.40e-1   2.87e-1
##  6 number_of_reviews  -1.81e-3 0.000208       -8.72 3.07e- 18 -2.22e-3  -1.41e-3
##  7 review_scores_ra…   3.06e-3 0.000386        7.92 2.44e- 15  2.30e-3   3.82e-3
##  8 room_typePrivate…  -4.19e-1 0.00988       -42.4  0.        -4.38e-1  -3.99e-1
##  9 room_typeShared …  -9.29e-1 0.0206        -45.2  0.        -9.70e-1  -8.89e-1
## 10 bedrooms            9.68e-2 0.00715        13.5  1.33e- 41  8.28e-2   1.11e-1
## 11 bathrooms           3.54e-2 0.00421         8.41 4.48e- 17  2.72e-2   4.37e-2
## 12 beds               -3.03e-2 0.00331        -9.16 5.57e- 20 -3.68e-2  -2.38e-2
## 13 accommodates        1.06e-1 0.00302        35.0  2.02e-260  9.98e-2   1.12e-1
## 14 host_is_superhos…   6.20e-2 0.00901         6.88 6.22e- 12  4.43e-2   7.97e-2
## 15 is_location_exac…  -7.67e-2 0.00862        -8.89 6.65e- 19 -9.36e-2  -5.98e-2
## 16 neighbourhood_si…  -2.03e-1 0.0146        -13.9  5.98e- 44 -2.32e-1  -1.75e-1
## 17 neighbourhood_si…  -1.86e-1 0.0131        -14.2  9.84e- 46 -2.12e-1  -1.60e-1
## 18 neighbourhood_si…  -2.08e-1 0.0374         -5.56 2.74e-  8 -2.81e-1  -1.34e-1
## 19 neighbourhood_si…  -2.95e-1 0.0130        -22.7  1.43e-112 -3.20e-1  -2.69e-1
## 20 security_deposit    2.39e-5 0.00000273      8.78 1.80e- 18  1.86e-5   2.93e-5
model11 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.494 0.494 0.562 955 0 19 -15630 31302 31466 5853 18551 18571
# Model 12: same as model 11 but with the log of security_deposit, because
# the variable is highly skewed.
# NOTE(review): the 0.001 offset presumably keeps the log finite for
# listings with a zero deposit - confirm.
model12 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact +
    neighbourhood_simplified + log(security_deposit + 0.001),
  data = regression_data
)

model12 %>% tidy(conf.int=TRUE)
## # A tibble: 20 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          7.00     0.0386      182.   0.         6.93     7.08    
##  2 prop_type_simplifi… -0.0362   0.0122       -2.97 2.94e-  3 -0.0601  -0.0124  
##  3 prop_type_simplifi…  0.103    0.0139        7.43 1.10e- 13  0.0762   0.131   
##  4 prop_type_simplifi… -0.0295   0.0154       -1.92 5.48e-  2 -0.0596   0.000608
##  5 prop_type_simplifi…  0.268    0.0120       22.3  9.48e-109  0.244    0.291   
##  6 number_of_reviews   -0.00190  0.000208     -9.11 8.91e- 20 -0.00230 -0.00149 
##  7 review_scores_rati…  0.00291  0.000386      7.53 5.22e- 14  0.00215  0.00366 
##  8 room_typePrivate r… -0.412    0.00989     -41.6  0.        -0.431   -0.393   
##  9 room_typeShared ro… -0.914    0.0206      -44.3  0.        -0.954   -0.874   
## 10 bedrooms             0.0960   0.00714      13.5  4.23e- 41  0.0820   0.110   
## 11 bathrooms            0.0352   0.00421       8.36 6.69e- 17  0.0269   0.0434  
## 12 beds                -0.0301   0.00330      -9.12 8.45e- 20 -0.0366  -0.0236  
## 13 accommodates         0.106    0.00301      35.0  1.54e-260  0.0997   0.112   
## 14 host_is_superhostT…  0.0569   0.00901       6.31 2.81e- 10  0.0392   0.0746  
## 15 is_location_exactT… -0.0729   0.00862      -8.46 2.87e- 17 -0.0898  -0.0560  
## 16 neighbourhood_simp… -0.201    0.0146      -13.8  4.78e- 43 -0.229   -0.172   
## 17 neighbourhood_simp… -0.187    0.0131      -14.3  3.22e- 46 -0.212   -0.161   
## 18 neighbourhood_simp… -0.211    0.0373       -5.67 1.49e-  8 -0.284   -0.138   
## 19 neighbourhood_simp… -0.290    0.0130      -22.3  6.19e-109 -0.315   -0.264   
## 20 log(security_depos…  0.00831  0.000705     11.8  5.91e- 32  0.00693  0.00969
model12 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.496 0.496 0.561 961 0 19 -15599 31240 31404 5834 18551 18571

Using the log of security deposit is better: model 12 has lower AIC and BIC, and a higher adjusted R-squared, than model 11.

# Amenities: the plan is to try three models - wifi only, breakfast only,
# and both together.

# Model 13: model 7 extended with the wifi indicator only.
model13 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact +
    neighbourhood_simplified + wifi,
  data = regression_data
)

model13 %>% tidy(conf.int=TRUE)
## # A tibble: 20 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.83     0.0462      148.   0.         6.74      6.92   
##  2 prop_type_simplifi… -0.0345   0.0122       -2.83 4.69e-  3 -0.0585   -0.0106 
##  3 prop_type_simplifi…  0.0994   0.0140        7.12 1.10e- 12  0.0720    0.127  
##  4 prop_type_simplifi… -0.0284   0.0154       -1.85 6.50e-  2 -0.0586    0.00177
##  5 prop_type_simplifi…  0.264    0.0120       21.9  3.15e-105  0.240     0.288  
##  6 number_of_reviews   -0.00175  0.000208     -8.40 4.88e- 17 -0.00216  -0.00134
##  7 review_scores_rati…  0.00303  0.000387      7.83 5.16e- 15  0.00227   0.00379
##  8 room_typePrivate r… -0.422    0.00989     -42.7  0.        -0.442    -0.403  
##  9 room_typeShared ro… -0.933    0.0206      -45.3  0.        -0.974    -0.893  
## 10 bedrooms             0.0974   0.00716      13.6  5.09e- 42  0.0834    0.111  
## 11 bathrooms            0.0354   0.00422       8.39 5.21e- 17  0.0271    0.0437 
## 12 beds                -0.0308   0.00331      -9.31 1.40e- 20 -0.0373   -0.0243 
## 13 accommodates         0.106    0.00302      35.0  1.98e-260  0.100     0.112  
## 14 host_is_superhostT…  0.0621   0.00903       6.87 6.45e- 12  0.0444    0.0797 
## 15 is_location_exactT… -0.0788   0.00863      -9.13 7.57e- 20 -0.0957   -0.0619 
## 16 neighbourhood_simp… -0.201    0.0146      -13.7  9.99e- 43 -0.229    -0.172  
## 17 neighbourhood_simp… -0.184    0.0131      -14.0  1.54e- 44 -0.209    -0.158  
## 18 neighbourhood_simp… -0.207    0.0374       -5.52 3.42e-  8 -0.280    -0.133  
## 19 neighbourhood_simp… -0.295    0.0130      -22.7  3.50e-112 -0.320    -0.269  
## 20 wifiTRUE             0.145    0.0277        5.23 1.74e-  7  0.0905    0.199
model13 %>% glance() %>% 
  kbl() %>% 
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.493 0.492 0.562 949 0 19 -15655 31351 31516 5869 18551 18571
# Model 14: model 7 extended with the breakfast indicator only.
model14 <- lm(
  price_4_nights ~ prop_type_simplified + number_of_reviews +
    review_scores_rating + room_type + bedrooms + bathrooms + beds +
    accommodates + host_is_superhost + is_location_exact +
    neighbourhood_simplified + breakfast,
  data = regression_data
)

model14 %>% tidy(conf.int=TRUE)
## # A tibble: 20 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.98     0.0382      183.   0.         6.90      7.05   
##  2 prop_type_simplifi… -0.0276   0.0121       -2.28 2.26e-  2 -0.0513   -0.00387
##  3 prop_type_simplifi…  0.0952   0.0138        6.88 5.99e- 12  0.0681    0.122  
##  4 prop_type_simplifi… -0.0175   0.0153       -1.15 2.52e-  1 -0.0474    0.0124 
##  5 prop_type_simplifi…  0.226    0.0121       18.7  2.73e- 77  0.202     0.250  
##  6 number_of_reviews   -0.00176  0.000206     -8.55 1.34e- 17 -0.00217  -0.00136
##  7 review_scores_rati…  0.00298  0.000383      7.77 8.55e- 15  0.00222   0.00373
##  8 room_typePrivate r… -0.449    0.00991     -45.3  0.        -0.469    -0.430  
##  9 room_typeShared ro… -0.958    0.0205      -46.8  0.        -0.998    -0.918  
## 10 bedrooms             0.0955   0.00709      13.5  3.84e- 41  0.0816    0.109  
## 11 bathrooms            0.0320   0.00419       7.63 2.37e- 14  0.0238    0.0402 
## 12 beds                -0.0308   0.00328      -9.39 6.60e- 21 -0.0373   -0.0244 
## 13 accommodates         0.105    0.00300      35.1  2.12e-261  0.0993    0.111  
## 14 host_is_superhostT…  0.0634   0.00895       7.09 1.42e- 12  0.0459    0.0809 
## 15 is_location_exactT… -0.0708   0.00857      -8.27 1.43e- 16 -0.0876   -0.0540 
## 16 neighbourhood_simp… -0.201    0.0145      -13.9  1.58e- 43 -0.229    -0.172  
## 17 neighbourhood_simp… -0.184    0.0130      -14.2  2.51e- 45 -0.209    -0.158  
## 18 neighbourhood_simp… -0.206    0.0371       -5.57 2.65e-  8 -0.279    -0.134  
## 19 neighbourhood_simp… -0.319    0.0129      -24.6  8.48e-132 -0.344    -0.293  
## 20 breakfastTRUE        0.267    0.0142       18.9  1.40e- 78  0.239     0.295
# One-row table of overall fit statistics for model14, styled for the report.
glance(model14) %>%
  kbl() %>%
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.502 0.501 0.558 983 0 19 -15492 31026 31190 5767 18551 18571
# Model 15: the predictors shared by the neighbouring models plus both the
# `wifi` and `breakfast` indicators.
model15 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type + bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    wifi + breakfast,
  data = regression_data
)

# Coefficient table including confidence intervals.
tidy(model15, conf.int = TRUE)
## # A tibble: 21 x 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           6.86     0.0458      150.   0.        6.77      6.95   
##  2 prop_type_simplifie… -0.0286   0.0121       -2.36 1.80e- 2 -0.0523   -0.00490
##  3 prop_type_simplifie…  0.0952   0.0138        6.88 6.04e-12  0.0681    0.122  
##  4 prop_type_simplifie… -0.0190   0.0153       -1.24 2.14e- 1 -0.0489    0.0109 
##  5 prop_type_simplifie…  0.226    0.0121       18.7  1.52e-77  0.203     0.250  
##  6 number_of_reviews    -0.00180  0.000206     -8.73 2.88e-18 -0.00220  -0.00140
##  7 review_scores_rating  0.00291  0.000383      7.59 3.44e-14  0.00216   0.00366
##  8 room_typePrivate ro… -0.449    0.00990     -45.4  0.       -0.469    -0.430  
##  9 room_typeShared room -0.958    0.0205      -46.8  0.       -0.998    -0.918  
## 10 bedrooms              0.0959   0.00709      13.5  1.80e-41  0.0820    0.110  
## # … with 11 more rows
# One-row table of overall fit statistics for model15, styled for the report.
glance(model15) %>%
  kbl() %>%
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.502 0.502 0.557 936 0 20 -15481 31005 31178 5760 18550 18571
# Model 16: checks another review-related variable by adding
# `reviews_per_month` to the predictors shared by the neighbouring models.
model16 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type + bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    reviews_per_month,
  data = regression_data
)

# Coefficient table including confidence intervals.
tidy(model16, conf.int = TRUE)
## # A tibble: 20 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.96e+0  0.0385      181.   0.         6.88     7.03    
##  2 prop_type_simplifi… -3.18e-2  0.0122       -2.60 9.22e-  3 -0.0558  -0.00787 
##  3 prop_type_simplifi…  1.01e-1  0.0140        7.21 5.66e- 13  0.0733   0.128   
##  4 prop_type_simplifi… -2.23e-2  0.0154       -1.45 1.48e-  1 -0.0525   0.00794 
##  5 prop_type_simplifi…  2.64e-1  0.0120       21.9  2.58e-105  0.240    0.288   
##  6 number_of_reviews   -3.83e-4  0.000344     -1.11 2.65e-  1 -0.00106  0.000291
##  7 review_scores_rati…  3.21e-3  0.000387      8.28 1.28e- 16  0.00245  0.00396 
##  8 room_typePrivate r… -4.25e-1  0.00991     -42.9  0.        -0.444   -0.406   
##  9 room_typeShared ro… -9.43e-1  0.0207      -45.6  0.        -0.983   -0.902   
## 10 bedrooms             9.62e-2  0.00716      13.4  5.75e- 41  0.0822   0.110   
## 11 bathrooms            3.63e-2  0.00422       8.59 9.21e- 18  0.0280   0.0445  
## 12 beds                -3.08e-2  0.00331      -9.30 1.53e- 20 -0.0373  -0.0243  
## 13 accommodates         1.06e-1  0.00302      35.1  1.59e-261  0.100    0.112   
## 14 host_is_superhostT…  7.42e-2  0.00930       7.98 1.57e- 15  0.0560   0.0925  
## 15 is_location_exactT… -7.28e-2  0.00873      -8.33 8.43e- 17 -0.0899  -0.0556  
## 16 neighbourhood_simp… -2.01e-1  0.0146      -13.8  5.36e- 43 -0.230   -0.173   
## 17 neighbourhood_simp… -1.85e-1  0.0131      -14.1  4.88e- 45 -0.211   -0.159   
## 18 neighbourhood_simp… -2.08e-1  0.0374       -5.57 2.58e-  8 -0.282   -0.135   
## 19 neighbourhood_simp… -2.99e-1  0.0130      -23.0  5.67e-115 -0.324   -0.273   
## 20 reviews_per_month   -4.25e-2  0.00877      -4.84 1.28e-  6 -0.0597  -0.0253
# One-row table of overall fit statistics for model16, styled for the report.
glance(model16) %>%
  kbl() %>%
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.493 0.492 0.563 949 0 19 -15657 31355 31520 5870 18551 18571
# Model 17: the predictors shared by the neighbouring models plus
# `calculated_host_listings_count`.
model17 <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type + bedrooms + bathrooms + beds + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    calculated_host_listings_count,
  data = regression_data
)

# Coefficient table including confidence intervals.
tidy(model17, conf.int = TRUE)
## # A tibble: 20 x 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)          6.93     0.0396      175.   0.         6.85e+0   7.01   
##  2 prop_type_simplifi… -0.0308   0.0122       -2.52 1.18e-  2 -5.48e-2  -0.00682
##  3 prop_type_simplifi…  0.103    0.0140        7.36 1.97e- 13  7.55e-2   0.130  
##  4 prop_type_simplifi… -0.0279   0.0154       -1.81 7.00e-  2 -5.81e-2   0.00229
##  5 prop_type_simplifi…  0.266    0.0121       22.1  9.84e-107  2.43e-1   0.290  
##  6 number_of_reviews   -0.00164  0.000209     -7.86 4.11e- 15 -2.05e-3  -0.00123
##  7 review_scores_rati…  0.00327  0.000389      8.40 4.69e- 17  2.51e-3   0.00403
##  8 room_typePrivate r… -0.419    0.00991     -42.3  0.        -4.39e-1  -0.400  
##  9 room_typeShared ro… -0.931    0.0206      -45.2  0.        -9.72e-1  -0.891  
## 10 bedrooms             0.0976   0.00716      13.6  3.97e- 42  8.36e-2   0.112  
## 11 bathrooms            0.0360   0.00422       8.53 1.53e- 17  2.78e-2   0.0443 
## 12 beds                -0.0305   0.00331      -9.20 3.93e- 20 -3.70e-2  -0.0240 
## 13 accommodates         0.106    0.00303      34.9  3.36e-258  9.97e-2   0.112  
## 14 host_is_superhostT…  0.0641   0.00903       7.09 1.35e- 12  4.64e-2   0.0818 
## 15 is_location_exactT… -0.0815   0.00865      -9.42 5.22e- 21 -9.85e-2  -0.0645 
## 16 neighbourhood_simp… -0.199    0.0146      -13.6  7.72e- 42 -2.28e-1  -0.170  
## 17 neighbourhood_simp… -0.180    0.0131      -13.7  1.47e- 42 -2.06e-1  -0.154  
## 18 neighbourhood_simp… -0.205    0.0374       -5.47 4.62e-  8 -2.78e-1  -0.131  
## 19 neighbourhood_simp… -0.292    0.0131      -22.3  3.36e-109 -3.18e-1  -0.266  
## 20 calculated_host_li…  0.00134  0.000365      3.66 2.53e-  4  6.20e-4   0.00205
# One-row table of overall fit statistics for model17, styled for the report.
glance(model17) %>%
  kbl() %>%
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.493 0.492 0.563 948 0 19 -15662 31365 31530 5873 18551 18571
# Side-by-side regression table for models 8 through 15. Coefficients
# significant at the 5% level are shown in bold rather than starred, and the
# footer reports the number of observations, R squared, adjusted R squared
# and the residual standard error.
set_caption(
  huxreg(
    model8, model9, model10, model11,
    model12, model13, model14, model15,
    statistics = c(
      "#observations" = "nobs",
      "R squared" = "r.squared",
      "Adj. R Squared" = "adj.r.squared",
      "Residual SE" = "sigma"
    ),
    bold_signif = 0.05,
    stars = NULL
  ),
  "Comparison of Models 3.0"
)
Comparison of Models 3.0
(1)(2)(3)(4)(5)(6)(7)(8)
(Intercept)6.935 6.908 6.961 6.961 7.005 6.829 6.977 6.857 
(0.039)(0.041)(0.039)(0.038)(0.039)(0.046)(0.038)(0.046)
prop_type_simplifiedCondominium-0.037 -0.034 -0.033 -0.035 -0.036 -0.035 -0.028 -0.029 
(0.012)(0.012)(0.012)(0.012)(0.012)(0.012)(0.012)(0.012)
prop_type_simplifiedHouse0.098 0.099 0.099 0.099 0.103 0.099 0.095 0.095 
(0.014)(0.014)(0.014)(0.014)(0.014)(0.014)(0.014)(0.014)
prop_type_simplifiedLoft-0.029 -0.027 -0.027 -0.026 -0.029 -0.028 -0.018 -0.019 
(0.015)(0.015)(0.015)(0.015)(0.015)(0.015)(0.015)(0.015)
prop_type_simplifiedOther0.265 0.263 0.264 0.263 0.268 0.264 0.226 0.226 
(0.012)(0.012)(0.012)(0.012)(0.012)(0.012)(0.012)(0.012)
number_of_reviews-0.002 -0.002 -0.002 -0.002 -0.002 -0.002 -0.002 -0.002 
(0.000)(0.000)(0.000)(0.000)(0.000)(0.000)(0.000)(0.000)
review_scores_rating0.003 0.001 0.003 0.003 0.003 0.003 0.003 0.003 
(0.000)(0.001)(0.000)(0.000)(0.000)(0.000)(0.000)(0.000)
room_typePrivate room-0.420 -0.422 -0.422 -0.419 -0.412 -0.422 -0.449 -0.449 
(0.010)(0.010)(0.010)(0.010)(0.010)(0.010)(0.010)(0.010)
room_typeShared room-0.933 -0.931 -0.934 -0.929 -0.914 -0.933 -0.958 -0.958 
(0.021)(0.021)(0.021)(0.021)(0.021)(0.021)(0.020)(0.020)
bedrooms0.098 0.097 0.097 0.097 0.096 0.097 0.096 0.096 
(0.007)(0.007)(0.007)(0.007)(0.007)(0.007)(0.007)(0.007)
bathrooms0.035 0.036 0.035 0.035 0.035 0.035 0.032 0.032 
(0.004)(0.004)(0.004)(0.004)(0.004)(0.004)(0.004)(0.004)
beds-0.030 -0.031 -0.031 -0.030 -0.030 -0.031 -0.031 -0.031 
(0.003)(0.003)(0.003)(0.003)(0.003)(0.003)(0.003)(0.003)
accommodates0.105 0.106 0.106 0.106 0.106 0.106 0.105 0.105 
(0.003)(0.003)(0.003)(0.003)(0.003)(0.003)(0.003)(0.003)
host_is_superhostTRUE0.055 0.060 0.063 0.062 0.057 0.062 0.063 0.062 
(0.009)(0.009)(0.009)(0.009)(0.009)(0.009)(0.009)(0.009)
is_location_exactTRUE-0.077 -0.080 -0.079 -0.077 -0.073 -0.079 -0.071 -0.070 
(0.009)(0.009)(0.009)(0.009)(0.009)(0.009)(0.009)(0.009)
neighbourhood_simplifiedRing 3-0.199 -0.203 -0.203 -0.203 -0.201 -0.201 -0.201 -0.199 
(0.015)(0.015)(0.015)(0.015)(0.015)(0.015)(0.014)(0.014)
neighbourhood_simplifiedRing 4-0.182 -0.185 -0.184 -0.186 -0.187 -0.184 -0.184 -0.183 
(0.013)(0.013)(0.013)(0.013)(0.013)(0.013)(0.013)(0.013)
neighbourhood_simplifiedRing 5-0.204 -0.207 -0.206 -0.208 -0.211 -0.207 -0.206 -0.206 
(0.037)(0.037)(0.037)(0.037)(0.037)(0.037)(0.037)(0.037)
neighbourhood_simplifiedRing 6-0.288 -0.299 -0.297 -0.295 -0.290 -0.295 -0.319 -0.317 
(0.013)(0.013)(0.013)(0.013)(0.013)(0.013)(0.013)(0.013)
cancellation_policymoderate0.055                                    
(0.010)                                   
cancellation_policystrict_14_with_grace_period0.066                                    
(0.011)                                   
review_scores_cleanliness     0.026                               
     (0.006)                              
instant_bookableTRUE          0.003                          
          (0.009)                         
security_deposit               0.000                     
               (0.000)                    
log(security_deposit + 0.001)                    0.008                
                    (0.001)               
wifiTRUE                         0.145      0.131 
                         (0.028)     (0.027)
breakfastTRUE                              0.267 0.265 
                              (0.014)(0.014)
#observations18571     18568     18571     18571     18571     18571     18571     18571     
R squared0.494 0.493 0.492 0.494 0.496 0.493 0.502 0.502 
Adj. R Squared0.493 0.492 0.492 0.494 0.496 0.492 0.501 0.502 
Residual SE0.562 0.563 0.563 0.562 0.561 0.562 0.558 0.557 
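########### Note on collinearity: variance inflation factors (VIFs; see https://www.displayr.com/variance-inflation-factors-vifs/) are used to check for collinearity among related predictors, e.g. beds, bathrooms and accommodates. None of the VIFs is high enough to suggest collinearity, so all of these variables can stay in the model.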

Conclusion: log(security_deposit) should definitely be included in the final model.

# Final model: the shared predictors plus cancellation policy, the logged
# security deposit, and the `wifi` and `breakfast` indicators.
# NOTE(review): the 0.001 offset inside log() presumably keeps the term
# defined for listings with a zero security deposit -- confirm.
final_model <- lm(
  formula = price_4_nights ~
    prop_type_simplified + number_of_reviews + review_scores_rating +
    room_type + bedrooms + beds + bathrooms + accommodates +
    host_is_superhost + is_location_exact + neighbourhood_simplified +
    cancellation_policy + log(security_deposit + 0.001) +
    wifi + breakfast,
  data = regression_data
)

# Coefficient table including confidence intervals.
tidy(final_model, conf.int = TRUE)
## # A tibble: 24 x 7
##    term                 estimate std.error statistic  p.value conf.low conf.high
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
##  1 (Intercept)           6.88     0.0459      150.   0.        6.79      6.97   
##  2 prop_type_simplifie… -0.0335   0.0121       -2.78 5.40e- 3 -0.0572   -0.00992
##  3 prop_type_simplifie…  0.0981   0.0138        7.12 1.08e-12  0.0711    0.125  
##  4 prop_type_simplifie… -0.0230   0.0152       -1.51 1.31e- 1 -0.0528    0.00683
##  5 prop_type_simplifie…  0.230    0.0120       19.1  7.04e-81  0.207     0.254  
##  6 number_of_reviews    -0.00209  0.000207    -10.1  6.28e-24 -0.00249  -0.00168
##  7 review_scores_rating  0.00270  0.000382      7.06 1.74e-12  0.00195   0.00344
##  8 room_typePrivate ro… -0.439    0.00989     -44.4  0.       -0.459    -0.420  
##  9 room_typeShared room -0.938    0.0204      -45.9  0.       -0.978    -0.898  
## 10 bedrooms              0.0953   0.00706      13.5  2.45e-41  0.0815    0.109  
## # … with 14 more rows
# One-row table of overall fit statistics for the final model, styled for
# the report.
glance(final_model) %>%
  kbl() %>%
  kable_styling()
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual nobs
0.507 0.506 0.555 829 0 23 -15394 30839 31034 5707 18547 18571
# Variance inflation factors for every term of the final model, used as the
# collinearity check.
final_model %>%
  vif()
##                               GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified          1.42  4            1.04
## number_of_reviews             1.15  1            1.07
## review_scores_rating          1.06  1            1.03
## room_type                     1.34  2            1.08
## bedrooms                      4.46  1            2.11
## beds                          3.12  1            1.77
## bathrooms                     1.64  1            1.28
## accommodates                  4.50  1            2.12
## host_is_superhost             1.15  1            1.07
## is_location_exact             1.09  1            1.04
## neighbourhood_simplified      1.40  4            1.04
## cancellation_policy           1.11  2            1.03
## log(security_deposit + 0.001) 1.07  1            1.04
## wifi                          1.01  1            1.00
## breakfast                     1.15  1            1.07

4.2.4 Diagnostics, collinearity, summary tables

# Diagnostic plots for the final model.
# NOTE(review): the autoplot method for lm objects is presumably supplied by
# ggfortify -- confirm it is loaded earlier in the file.
final_model %>%
  autoplot()

# Keep only apartment listings offering a private room that have at least
# 10 reviews and a review score of at least 90.
reading_week <- regression_data %>%
  filter(
    prop_type_simplified == "Apartment",
    room_type == "Private room",
    number_of_reviews >= 10,
    review_scores_rating >= 90
  )

reading_week

# Fix the seed so the random train/test split below is reproducible.
set.seed(6789)

# 75% of the filtered listings go to the training set, the rest to testing.
train_test_split <- initial_split(reading_week, prop = 0.75)
reading_week_train <- training(train_test_split)
reading_week_test <- testing(train_test_split)

# NOTE(review): predictions come from model1, not final_model, and no model
# is refitted on the training split in this section -- confirm this is the
# intended evaluation.

# Root mean squared error on the training split: the square root of the mean
# of the squared prediction errors. Each error is squared before averaging;
# squaring the sum of the errors instead lets positive and negative errors
# cancel and does not give the RMSE.
rmse_train <- reading_week_train %>%
  mutate(predictions = predict(model1, .)) %>%
  summarise(rmse = sqrt(mean((predictions - price_4_nights)^2))) %>%
  pull()

rmse_train

# Same RMSE calculation on the held-out testing split.
rmse_test <- reading_week_test %>%
  mutate(predictions = predict(model1, .)) %>%
  summarise(rmse = sqrt(mean((predictions - price_4_nights)^2))) %>%
  pull()

rmse_test

```